# Clear memory 
rm(list = ls())

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  data.table, janitor,
  magrittr, lubridate, foreign, sp, zoo, 
  broom, readxl, haven, tidyverse, tidylog, 
  readxl, sf, maps, tigris
)
options("tidylog.display" = NULL)
select <- dplyr::select

# This file was originally used for a deprecated part of the paper and
# is now only used for computing dollar value welfare results

# beginning and ending year of sample
initial_year <- 1986
final_year <- 2001

# trade elasticity
trade_elasticity <- 4

# load data and create panel for productivity estimation
bea_df <- read_dta("data/BEA/long_1969_2018.dta") |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
  filter(year >= initial_year & year <= final_year) %>% 
  filter(industry == "Manufacturing" | industry == "Remaining Industries")

ma_mfg = read_csv("data/tradecosts/MA_mfg.csv") |> 
  set_colnames(c("fips", 1980:2000)) |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
  pivot_longer(
    cols = -fips,
    names_to = "year",
    values_to = "ma"
  ) |> 
  mutate(industry = "Manufacturing",
         year = as.numeric(year))

ma_other = read_csv("data/tradecosts/MA_rem.csv") |> 
  set_colnames(c("fips", 1980:2000)) |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
  pivot_longer(
    cols = -fips,
    names_to = "year",
    values_to = "ma"
  ) |> 
  mutate(industry = "Remaining Industries",
         year = as.numeric(year))

ma_df = bind_rows(ma_mfg, ma_other)

caa_df <- fread("data/greenbook/nonattainment_status.csv") |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0"))

df_out <- left_join(bea_df, ma_df) %>% 
  left_join(caa_df) %>% 
  filter(industry == "Manufacturing" | industry == "Remaining Industries")

fwrite(df_out, "data/data-full-panel.csv")
name_storage = names(df_out)

fips <- fread("data/data-full-panel.csv") %>%
  as_tibble() %>% 
  filter(year == 1990 & industry == "Manufacturing") %>%
  select(fips) %>%
  mutate(fips = str_pad(fips, 5, "left", "0")) 

full_df <- fread("data/data-full-panel.csv") %>%
  as_tibble() %>%
  mutate(fips = str_pad(fips, 5, "left", "0")) %>% 
  filter(year >= initial_year & year <= final_year)

first_nonattain <- full_df %>%
  rowwise() %>%
  mutate(sum_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>%
  arrange(fips, sum_nonattainment, year) %>%
  group_by(fips, sum_nonattainment) %>%
  mutate(first_nonattain = first(year)) %>%
  ungroup() %>%
  # counties that went into nonattainment in 1991 did so at the very end of the year
  mutate(first_nonattain = ifelse(first_nonattain == 1990, 1991, first_nonattain)) %>% 
  filter(sum_nonattainment == 1) %>%
  select(fips, first_nonattain) %>%
  distinct() %>%
  mutate(treated = TRUE)

df <- full_df %>%
  left_join(first_nonattain) %>%
  mutate(treated = ifelse(is.na(treated), FALSE, TRUE)) %>%
  arrange(fips, year) %>%
  rowwise() %>%
  mutate(sum_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>%
  arrange(fips, year) %>%
  group_by(industry) %>%
  mutate(industry_id = as.factor(cur_group_id())) %>%
  ungroup() %>%
  mutate(manufacturing = industry == "Manufacturing") %>%
  arrange(fips, year) %>%
  mutate(first_non_1990 = ifelse(first_nonattain >= 1991, first_nonattain, NA),
         event_time = year - first_non_1990,
         event_time = ifelse(!is.na(first_non_1990), event_time, -1),
         ones = 1) %>%
  group_by(event_time) %>% 
  mutate(event_obs = n()) %>% 
  ungroup() %>% 
  mutate(balanced_time = event_obs == sort(unique(event_obs), decreasing = T)[2]) %>% 
  group_by(balanced_time) %>% 
  mutate(upper_cap = max(event_time),
         upper_cap = ifelse(upper_cap > 5, 5, upper_cap),
         lower_cap = min(event_time), 
         lower_cap = ifelse(lower_cap < -5, -5, lower_cap)) %>% 
  mutate(upper_cap = 4,
         lower_cap = -4) %>%
  ungroup() %>% 
  mutate(upper_cap = min(upper_cap),
         lower_cap = max(lower_cap)) %>% 
  mutate(
    event_time = case_when(
      event_time > upper_cap ~ upper_cap + 1,
      event_time < lower_cap ~ lower_cap - 1,
      TRUE ~ event_time
    ),
    event_time_manu = ifelse(manufacturing, event_time, -1)
  ) %>% 
  mutate(post_nonattainment = event_time >= 1,
         treated = post_nonattainment & manufacturing) 

df$trade_elasticity <- trade_elasticity

# Industry 4 is manufacturing, 2 is remaining
levels(df$industry_id) <- c(seq(5, 1, -1))
levels(df$manufacturing) <- c(seq(5, 1, -1))

# last edits
df = df %>% 
  group_by(fips, year) %>% 
  mutate(emp_ratio = emp/sum(emp)) %>% 
  ungroup()

fwrite(df, "data/productivity_panel.csv")

first_nonattain <- full_df %>%
  rowwise() %>%
  pivot_longer(
    cols = nonattainment_so2:nonattainment_co,
    names_to = "pollutant",
    names_prefix = "nonattainment_",
    values_to = "nonattainment_pol"
  ) %>% 
  arrange(fips, pollutant, nonattainment_pol, year) %>%
  group_by(fips, pollutant, nonattainment_pol) %>%
  mutate(first_nonattain = first(year)) %>%
  ungroup() %>%
  select(-nonattainment,-sum_nonattainment) %>% 
  # counties that went into nonattainment in 1990 did so at the very end of the year
  mutate(first_nonattain = ifelse(first_nonattain == 1990, 1991, first_nonattain)) %>% 
  filter(nonattainment_pol > 0) %>%
  select(fips, first_nonattain, pollutant) %>%
  distinct() %>%
  mutate(treated = TRUE) %>% 
  pivot_wider(
    names_from = "pollutant",
    names_prefix = "first_nonattain_",
    values_from = "first_nonattain"
  )

df <- full_df %>%
  left_join(first_nonattain) %>%
  mutate(treated = ifelse(is.na(treated), FALSE, TRUE)) %>%
  arrange(fips, year) %>%
  group_by(industry) %>%
  mutate(industry_id = as.factor(cur_group_id())) %>%
  ungroup() %>%
  mutate(manufacturing = industry == "Manufacturing") %>%
  arrange(fips, year) %>%
  mutate(first_non_1990_co = ifelse(first_nonattain_co >= 1991, first_nonattain_co, NA),
         first_non_1990_o3 = ifelse(first_nonattain_o3 >= 1991, first_nonattain_o3, NA),
         first_non_1990_pm10 = ifelse(first_nonattain_pm10 >= 1991, first_nonattain_pm10, NA),
         first_non_1990_so2 = ifelse(first_nonattain_so2 >= 1991, first_nonattain_so2, NA)) %>% 
  mutate(treated_co = (year - first_non_1990_co > 1) & manufacturing,
         treated_o3 = (year - first_non_1990_o3 > 1) & manufacturing, 
         treated_pm10 = (year - first_non_1990_pm10 > 1) & manufacturing, 
         treated_so2 = (year - first_non_1990_so2 > 1) & manufacturing,
         post_co = (year - first_non_1990_co > 1),
         post_o3 = (year - first_non_1990_o3 > 1),
         post_pm10 = (year - first_non_1990_pm10 > 1),
         post_so2 = (year - first_non_1990_so2 > 1)) %>% 
  replace_na(list(treated_co = 0, treated_o3 = 0, treated_pm10 = 0, treated_so2 = 0,
                  post_co = FALSE, post_o3 = FALSE, post_pm10 = FALSE, post_so2 = FALSE))

fwrite(df, "data/productivity_panel_by_pollutant.csv")

# ================================
# For constructing time series of length of stay in nonattainment
# ================================

# beginning and ending year of sample
initial_year <- 1986
final_year <- 2018

# trade elasticity
trade_elasticity <- 4

# load data and create panel for productivity estimation
bea_df <- read_dta("data/BEA/long_1969_2018.dta") |> 
    mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
    filter(year >= initial_year & year <= final_year) %>% 
    filter(industry == "Manufacturing" | industry == "Remaining Industries")


ma_mfg = read_csv("data/tradecosts/MA_mfg.csv") |> 
  set_colnames(c("fips", 1980:2000)) |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
  pivot_longer(
    cols = -fips,
    names_to = "year",
    values_to = "ma"
  ) |> 
  mutate(industry = "Manufacturing",
         year = as.numeric(year))

ma_other = read_csv("data/tradecosts/MA_rem.csv") |> 
  set_colnames(c("fips", 1980:2000)) |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0")) |> 
  pivot_longer(
    cols = -fips,
    names_to = "year",
    values_to = "ma"
  ) |> 
  mutate(industry = "Remaining Industries",
         year = as.numeric(year))

ma_df = bind_rows(ma_mfg, ma_other)

caa_df <- fread("data/greenbook/nonattainment_status.csv") |> 
  mutate(fips = str_pad(fips, 5, "left", pad = "0"))

df_out <- left_join(bea_df, ma_df) %>% 
  left_join(caa_df) %>% 
  filter(industry == "Manufacturing" | industry == "Remaining Industries")


fwrite(df_out, "data/data_full_panel_time_series.csv")


fips <- df_out %>%
  as_tibble() %>% 
  filter(year == 1990 & industry == "Manufacturing") %>%
  select(fips) %>%
  mutate(fips = str_pad(fips, 5, "left", "0")) 

full_df <- df_out %>%
  as_tibble() %>%
  mutate(fips = str_pad(fips, 5, "left", "0")) %>% 
  filter(year >= initial_year & year <= final_year)

first_nonattain <- full_df %>%
  rowwise() %>%
  mutate(sum_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>%
  arrange(fips, sum_nonattainment, year) %>%
  group_by(fips, sum_nonattainment) %>%
  mutate(first_nonattain = first(year)) %>%
  ungroup() %>%
  # counties that went into nonattainment in 1991 did so at the very end of the year
  mutate(first_nonattain = ifelse(first_nonattain == 1990, 1991, first_nonattain)) %>% 
  filter(sum_nonattainment == 1) %>%
  select(fips, first_nonattain) %>%
  distinct() %>%
  mutate(treated = TRUE)

df <- full_df %>%
  left_join(first_nonattain) %>%
  mutate(treated = ifelse(is.na(treated), FALSE, TRUE)) %>%
  arrange(fips, year) %>%
  rowwise() %>%
  mutate(sum_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>%
  arrange(fips, year) %>%
  group_by(industry) %>%
  mutate(industry_id = as.factor(cur_group_id())) %>%
  ungroup() %>%
  mutate(manufacturing = industry == "Manufacturing") %>%
  arrange(fips, year) %>%
  mutate(first_non_1990 = ifelse(first_nonattain >= 1991, first_nonattain, NA),
         event_time = year - first_non_1990,
         event_time = ifelse(!is.na(first_non_1990), event_time, -1),
         ones = 1) %>%
  group_by(event_time) %>% 
  mutate(event_obs = n()) %>% 
  ungroup() %>% 
  mutate(balanced_time = event_obs == sort(unique(event_obs), decreasing = T)[2]) %>% 
  group_by(balanced_time) %>% 
  mutate(upper_cap = max(event_time),
         upper_cap = ifelse(upper_cap > 5, 5, upper_cap),
         lower_cap = min(event_time), 
         lower_cap = ifelse(lower_cap < -5, -5, lower_cap)) %>% 
  mutate(upper_cap = 4,
         lower_cap = -4) %>%
  ungroup() %>% 
  mutate(upper_cap = min(upper_cap),
         lower_cap = max(lower_cap)) %>% 
  mutate(
    event_time = case_when(
      event_time > upper_cap ~ upper_cap + 1,
      event_time < lower_cap ~ lower_cap - 1,
      TRUE ~ event_time
    ),
    event_time_manu = ifelse(manufacturing, event_time, -1)
  ) %>% 
  mutate(post_nonattainment = event_time >= 1,
         treated = post_nonattainment & manufacturing) 

df$trade_elasticity <- trade_elasticity

# Industry 4 is manufacturing, 2 is remaining
levels(df$industry_id) <- c(seq(5, 1, -1))
levels(df$manufacturing) <- c(seq(5, 1, -1))

# last edits
df = df %>% 
  group_by(fips, year) %>% 
  mutate(emp_ratio = emp/sum(emp)) %>% 
  ungroup()

fwrite(df, "data/productivity_panel_time_series.csv")